# load necessary libraries
library(xgboost)
library(caret)
library(Metrics)  # provides rmse(), used in the evaluation step below
train <- readRDS("train.rds")
test <- readRDS("test.rds")
print(train)
print(test)
# train:test -> 80%:20% (sequential split; assumes rows are already in random order)
n_train <- floor(0.8 * nrow(train))
train_train <- train[1:n_train, ]
test_train <- train[(n_train + 1):nrow(train), ]
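If the rows are not already shuffled, a random split would be safer than the sequential one above; a minimal sketch of the alternative, using caret's createDataPartition (caret is already loaded):
# sample 80% of rows at random, stratified on the label in column 230
set.seed(123)
idx <- createDataPartition(train[, 230], p = 0.8, list = FALSE)
train_train <- train[idx, ]
test_train <- train[-idx, ]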

Use caret to find the best hyperparameters using 5-fold cross-validation.

# set up a grid to choose the best values for the following parameters
control <- trainControl(method = "cv", number = 5)
xgb_grid <- expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
xgb_caret <- train(x = train_train[, -230], y = train_train[, 230],
                   method = "xgbTree", trControl = control, tuneGrid = xgb_grid)
xgb_caret$bestTune
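Beyond bestTune, the full resampling results can be inspected; for example, the five settings with the lowest cross-validated RMSE (results is a standard field on a caret train object):
# top five parameter combinations by CV RMSE
head(xgb_caret$results[order(xgb_caret$results$RMSE), ], 5)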

# From this run, the best tuning parameters are max_depth = 4, eta = 0.1 and min_child_weight = 4

train_labels <- train_train[, 230]
# put our training & testing data into two separate xgb.DMatrix objects
dtrain <- xgb.DMatrix(data = as.matrix(train_train[, -230]), label = train_labels)
dtest <- xgb.DMatrix(data = as.matrix(test_train[, -230]))
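A quick sanity check that the labels landed in the DMatrix (dim() and getinfo() are part of the xgboost API):
# dimensions of the training matrix and the first few labels
dim(dtrain)
head(getinfo(dtrain, "label"))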
# Using the best parameters from the tune
parameters <- list(
        objective = "reg:linear",  # deprecated in newer xgboost releases; "reg:squarederror" is the equivalent replacement
        booster = "gbtree",
        eta = 0.1,                 # default = 0.3
        gamma = 0,
        max_depth = 4,             # default = 6
        min_child_weight = 4,      # default = 1
        subsample = 1,
        colsample_bytree = 1
)
# cross-validation using the built-in xgb.cv() to find the best number of rounds
set.seed(123)
xgbcv <- xgb.cv(params = parameters, data = dtrain, nrounds = 10000, nfold = 5,
                showsd = TRUE, stratified = TRUE, print_every_n = 40,
                early_stopping_rounds = 10, maximize = FALSE)
[1] train-rmse:11.413846+0.007614   test-rmse:11.413752+0.030885 
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.

[41]    train-rmse:7.644597+0.005097    test-rmse:7.644520+0.032602 
[81]    train-rmse:5.122702+0.003400    test-rmse:5.123048+0.029993 
[121]   train-rmse:3.435296+0.002239    test-rmse:3.435759+0.026427 
[161]   train-rmse:2.306322+0.001437    test-rmse:2.307605+0.022512 
[201]   train-rmse:1.550925+0.000961    test-rmse:1.552953+0.018722 
[241]   train-rmse:1.045665+0.000752    test-rmse:1.048705+0.015871 
[281]   train-rmse:0.708045+0.000700    test-rmse:0.713113+0.013718 
[321]   train-rmse:0.482904+0.000802    test-rmse:0.491152+0.012104 
[361]   train-rmse:0.333550+0.000965    test-rmse:0.345948+0.010399 
[401]   train-rmse:0.235578+0.001194    test-rmse:0.253653+0.009072 
[441]   train-rmse:0.172388+0.001377    test-rmse:0.197499+0.008986 
[481]   train-rmse:0.132697+0.001564    test-rmse:0.165074+0.010620 
[521]   train-rmse:0.108632+0.001682    test-rmse:0.147400+0.012964 
[561]   train-rmse:0.094332+0.001771    test-rmse:0.138112+0.015068 
[601]   train-rmse:0.085758+0.001864    test-rmse:0.133278+0.016492 
[641]   train-rmse:0.080401+0.001925    test-rmse:0.130593+0.017600 
[681]   train-rmse:0.076742+0.001888    test-rmse:0.129021+0.018262 
[721]   train-rmse:0.074035+0.001825    test-rmse:0.127961+0.018783 
[761]   train-rmse:0.071941+0.001759    test-rmse:0.127210+0.019097 
[801]   train-rmse:0.070175+0.001732    test-rmse:0.126631+0.019305 
[841]   train-rmse:0.068606+0.001697    test-rmse:0.126181+0.019362 
[881]   train-rmse:0.067198+0.001641    test-rmse:0.125828+0.019394 
[921]   train-rmse:0.065869+0.001604    test-rmse:0.125473+0.019420 
[961]   train-rmse:0.064626+0.001572    test-rmse:0.125227+0.019502 
[1001]  train-rmse:0.063460+0.001568    test-rmse:0.124946+0.019540 
[1041]  train-rmse:0.062388+0.001617    test-rmse:0.124762+0.019490 
[1081]  train-rmse:0.061345+0.001632    test-rmse:0.124658+0.019473 
[1121]  train-rmse:0.060370+0.001645    test-rmse:0.124539+0.019446 
[1161]  train-rmse:0.059343+0.001610    test-rmse:0.124447+0.019409 
[1201]  train-rmse:0.058397+0.001612    test-rmse:0.124355+0.019398 
[1241]  train-rmse:0.057484+0.001613    test-rmse:0.124269+0.019407 
[1281]  train-rmse:0.056640+0.001594    test-rmse:0.124162+0.019396 
[1321]  train-rmse:0.055808+0.001611    test-rmse:0.124112+0.019350 
Stopping. Best iteration:
[1313]  train-rmse:0.055956+0.001618    test-rmse:0.124112+0.019363
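The optimal round count can be read off the returned object rather than the console log; with early stopping, an xgb.cv result carries best_iteration and evaluation_log:
# best iteration index and its cross-validated RMSE
xgbcv$best_iteration
xgbcv$evaluation_log[xgbcv$best_iteration, ]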
# train the final model with the tuned parameters; nrounds = 1032 came from an
# earlier xgb.cv run (xgbcv$best_iteration from the run above could be used instead)
xgb_mod <- xgb.train(data = dtrain, params = parameters, nrounds = 1032)
XGBpred <- predict(xgb_mod, dtest)
head(XGBpred)
[1] 12.13067 13.05577 11.84750 12.02057 11.98641 12.43907
predictions_XGB <- XGBpred  # predictions are on the log scale; exp() reverses the transform below
# evaluation of results (correlation and RMSE are computed on the log scale)
cor(predictions_XGB, test_train[, 230])
[1] 0.9417143
rmse(test_train[, 230], predictions_XGB)  # rmse() from the Metrics package
[1] 0.1320425
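As a check, the same RMSE can be computed directly from its definition, without the Metrics dependency:
# root mean squared error on the log scale
sqrt(mean((test_train[, 230] - predictions_XGB)^2))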
# visualizing the results (exp() maps the log-scale labels back to the original units)
plot(exp(predictions_XGB), exp(test_train[, 230]),
     xlab = "Predicted Label", ylab = "Actual Label",
     main = "Plot of Actual Against Predicted Labels")
lin.mod <- lm(exp(test_train[, 230]) ~ exp(predictions_XGB))
pr.lm <- predict(lin.mod)
lines(pr.lm ~ exp(predictions_XGB), col = "blue", lwd = 0.5)
lines(c(0, 450000), c(0, 450000))
legend("topleft", legend = c("fitted line", "45 degree line"),
       col = c("blue", "black"), lty = 1, cex = 0.8)

# view variable importance plot
# install.packages("Ckmeans.1d.dp")
library(Ckmeans.1d.dp)  # required for the ggplot-based clustered importance plot
mat <- xgb.importance(feature_names = colnames(train_train[, -230]), model = xgb_mod)
xgb.ggplot.importance(importance_matrix = mat[1:20], rel_to_first = TRUE)
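If Ckmeans.1d.dp is not available, xgboost's base-graphics plot draws the same ranking without it:
# top 20 features by gain, scaled relative to the most important one
xgb.plot.importance(importance_matrix = mat[1:20, ], rel_to_first = TRUE)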
